This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the ODI batting data set; the first row holds the column names.
odi <- read.csv("odi-batting.csv", header = TRUE)
library(knitr)
## Warning: package 'knitr' was built under R version 3.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(scales)
## Warning: package 'scales' was built under R version 3.4.2
library(treemapify)
## Warning: package 'treemapify' was built under R version 3.4.3
# Parse the match date once, then derive the calendar fields from it.
data <- as.Date(odi$MatchDate, format = "%m-%d-%Y")
odi$Year <- format(data, "%Y")
odi$Month <- format(data, "%m")
odi$Weekday <- format(data, "%A")

# Total runs per player (missing scores ignored), highest total first.
top_players <- odi %>%
  group_by(Player) %>%
  summarise(
    Total_Runs = sum(Runs, na.rm = TRUE),
    Country = first(Country)
  ) %>%
  arrange(desc(Total_Runs))
kable(top_players[1:5, ])
| Player | Total_Runs | Country |
|---|---|---|
| Sachin R Tendulkar | 18111 | India |
| Ricky T Ponting | 13686 | Australia |
| Sanath T Jayasuriya | 13430 | Sri Lanka |
| Inzamam-ul-Haq | 11739 | Pakistan |
| Jacques H Kallis | 11372 | South Africa |
## Countrywise total number of players
# Count the distinct players listed under each country, largest count first.
country_players <- odi %>%
  group_by(Country) %>%
  summarise(Count_Players = n_distinct(Player)) %>%
  arrange(desc(Count_Players))
head(country_players)
## # A tibble: 6 x 2
## Country Count_Players
## <fctr> <int>
## 1 England 220
## 2 India 185
## 3 Australia 179
## 4 Pakistan 179
## 5 New Zealand 160
## 6 West Indies 159
# Per-row milestone flags (1 = yes, 0 = no; NA when Runs is missing).
# Lower bounds are inclusive, matching the century rule: a score of exactly 50
# counts as a fifty and a score of exactly 90 counts as a missed century.
odi <- odi %>%
  mutate(
    ducks = if_else(Runs == 0, 1, 0),
    centuries = if_else(Runs >= 100, 1, 0),
    missed = if_else(Runs >= 90 & Runs < 100, 1, 0),
    fifties = if_else(Runs >= 50 & Runs < 100, 1, 0)
  )

# Add the flags up per player and rank players by total runs.
players_summary <- odi %>%
  group_by(Player) %>%
  summarise(
    Total_Runs = sum(Runs, na.rm = TRUE),
    Centuries = sum(centuries, na.rm = TRUE),
    ducks = sum(ducks, na.rm = TRUE),
    fifties = sum(fifties, na.rm = TRUE),
    Missed_Centuries = sum(missed, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Runs))
kable(head(players_summary))
| Player | Total_Runs | Centuries | ducks | fifties | Missed_Centuries |
|---|---|---|---|---|---|
| Sachin R Tendulkar | 18111 | 48 | 20 | 93 | 17 |
| Ricky T Ponting | 13686 | 30 | 20 | 80 | 5 |
| Sanath T Jayasuriya | 13430 | 28 | 34 | 65 | 6 |
| Inzamam-ul-Haq | 11739 | 10 | 20 | 80 | 2 |
| Jacques H Kallis | 11372 | 17 | 17 | 81 | 8 |
| Sourav C Ganguly | 11363 | 22 | 16 | 70 | 3 |
# Bar chart for total runs by month
# Total runs per calendar month, drawn as bars.
odi %>%
  group_by(Month) %>%
  summarise(runs = sum(Runs, na.rm = TRUE)) %>%
  ggplot(aes(x = Month, y = runs)) +
  geom_bar(stat = "identity")

# Runs against score rate for Sachin Tendulkar, with a smoothed trend line.
odi %>%
  filter(Player == "Sachin R Tendulkar") %>%
  ggplot(aes(x = Runs, y = ScoreRate)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
# Compare the runs / score-rate trend of three selected players.
top_player <- c("Sachin R Tendulkar", "Virender Sehwag", "Sourav C Ganguly")
odi %>%
  filter(Player %in% top_player) %>%
  ggplot(aes(x = Runs, y = ScoreRate, color = Player)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
# The ten players with the highest total runs.
top_players <- odi %>%
  group_by(Player) %>%
  summarise(Runs = sum(Runs, na.rm = TRUE)) %>%
  arrange(desc(Runs)) %>%
  head(10)
top_players
## # A tibble: 10 x 2
## Player Runs
## <fctr> <int>
## 1 Sachin R Tendulkar 18111
## 2 Ricky T Ponting 13686
## 3 Sanath T Jayasuriya 13430
## 4 Inzamam-ul-Haq 11739
## 5 Jacques H Kallis 11372
## 6 Sourav C Ganguly 11363
## 7 Rahul Dravid 10889
## 8 Brian C Lara 10405
## 9 D P Mahela Jayawardene 9913
## 10 Mohammad Yousuf 9720
# Smoothed runs / score-rate trend, one line per top-ten player.
odi %>%
  filter(Player %in% top_players$Player) %>%
  ggplot(aes(x = Runs, y = ScoreRate, color = Player)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
# Year-by-year summary for Sachin Tendulkar. Every aggregate skips missing
# values so that a single NA score cannot turn a whole year's total into NA.
sachin <- odi %>%
  filter(Player == "Sachin R Tendulkar") %>%
  group_by(Year) %>%
  summarise(
    Total_run = sum(Runs, na.rm = TRUE),
    Total_cen = sum(centuries, na.rm = TRUE),
    Avg_Runs = mean(Runs, na.rm = TRUE)
  )

# Point size encodes the number of centuries; colour is mapped to the negated
# yearly average.
g <- ggplot(
  sachin,
  aes(x = Year, y = Total_run, size = Total_cen, color = -Avg_Runs)
) +
  geom_point()
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
Top 10 players: runs scored by the top 10 relative to total runs (top10 / total runs)
# install.packages("devtools")
# install.packages("treemapify")
# Top 50 run scorers listed under India, with their mean score rate.
indian_player <- odi %>%
  filter(Country == "India") %>%
  group_by(Player) %>%
  summarise(
    Total_Runs = sum(Runs, na.rm = TRUE),
    Avg_SR = mean(ScoreRate, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Runs)) %>%
  head(50)
indian_player
## # A tibble: 50 x 3
## Player Total_Runs Avg_SR
## <fctr> <int> <dbl>
## 1 Sachin R Tendulkar 18111 74.61213
## 2 Sourav C Ganguly 11363 64.90043
## 3 Rahul Dravid 10889 66.91160
## 4 Mohammad Azharuddin 9378 69.66844
## 5 Yuvraj Singh 8051 79.13948
## 6 Virender Sehwag 7760 93.87600
## 7 Mahendra S Dhoni 6497 85.30714
## 8 Alaysinhji D Jadeja 5359 66.71011
## 9 Navjot S Sidhu 4413 57.45315
## 10 Gautam Gambhir 4286 73.77817
## # ... with 40 more rows
# Treemap: tile area is total runs, fill colour is the mean score rate.
g <- ggplot(
  indian_player,
  aes(area = Total_Runs, label = Player, fill = Avg_SR)
) +
  geom_treemap() +
  geom_treemap_text()
plot(g)
#library(scales)
# The scale will run from -1 to 1, so check the min and max values first; they are then mapped onto the range -1 to 1.
# x = c(100, 10, 0, 60, 90)
# rescale(x,c(-1,1))
# mean(x)
# min(indian_player$Avg_SR)
# max(indian_player$Avg_SR)
# Map the mean score rate onto [-1, 1] so the diverging fill scale is centred.
indian_player$Avg_SR_Scaled <- rescale(indian_player$Avg_SR, c(-1, 1))

# Same treemap, now filled by the negated scaled score rate on a
# red / yellow / green gradient.
g <- ggplot(
  indian_player,
  aes(area = Total_Runs, label = Player, fill = -Avg_SR_Scaled)
) +
  geom_treemap()
g <- g +
  geom_treemap_text() +
  scale_fill_gradient2(low = "red", mid = "yellow", high = "green")
plot(g)
# Redraw the treemap with a custom palette. The plot is rebuilt from the data
# instead of being stacked onto the previous `g`, which already carries a label
# layer and a fill scale; adding both again duplicated the labels and produced
# a "Scale for 'fill' is already present" message.
g <- ggplot(
  indian_player,
  aes(area = Total_Runs, label = Player, fill = -Avg_SR_Scaled)
) +
  geom_treemap() +
  geom_treemap_text() +
  scale_fill_gradient2(low = "#F65314", mid = "#FFBB00", high = "#7CBB00")
plot(g)
# Total runs per player as bars, filled by the negated scaled score rate.
ggplot(
  indian_player,
  aes(x = Player, y = Total_Runs, fill = -Avg_SR_Scaled)
) +
  geom_bar(stat = "identity") +
  scale_fill_gradient2(low = "#F65314", mid = "#FFBB00", high = "#7CBB00")
Run bins used below: 0-25, 26-50, 51-75, 76-100, 100+ (more than 100)
# Bucket each row by runs scored. Rows with a missing Runs value get a missing
# bin rather than falling through to "100+" (NA never matches `%in%`, so the
# old nested if_else ladder sent them to its final branch).
a <- odi %>%
  mutate(
    run_bins = case_when(
      is.na(Runs) ~ NA_character_,
      Runs <= 25 ~ "0-25",
      Runs <= 50 ~ "26-50",
      Runs <= 75 ~ "51-75",
      Runs <= 100 ~ "76-100",
      TRUE ~ "100+"
    )
  )

# Peek at the first rows to check the bin labels against the raw scores.
a %>%
  group_by(run_bins) %>%
  select(run_bins, Runs) %>%
  head(10)
## # A tibble: 10 x 2
## # Groups: run_bins [5]
## run_bins Runs
## <chr> <int>
## 1 100+ 118
## 2 100+ 110
## 3 76-100 100
## 4 76-100 82
## 5 51-75 57
## 6 51-75 55
## 7 26-50 37
## 8 26-50 34
## 9 0-25 20
## 10 0-25 16
# Number of rows in each run bin, plotted from the most to the least frequent.
b <- a %>%
  group_by(run_bins) %>%
  summarise(frequency = n())
ggplot(b, aes(x = reorder(run_bins, -frequency), y = frequency)) +
  geom_bar(stat = "identity")
# Parsed match date and its year as extra columns on `odi`.
odi$Date <- as.Date(odi$MatchDate, format = "%m-%d-%Y")
odi$year <- format(odi$Date, "%Y")

# The ten players with the highest total runs.
top_run <- odi %>%
  group_by(Player) %>%
  summarise(tot_run = sum(Runs, na.rm = TRUE)) %>%
  arrange(desc(tot_run)) %>%
  head(10)

# Flag scores of 100 or more.
odi <- odi %>%
  mutate(centuries = if_else(Runs > 99, 1, 0))

# Centuries per year for each of those players, one facet per player.
a <- odi %>%
  filter(Player %in% top_run$Player, centuries == 1) %>%
  group_by(Player, year) %>%
  summarise(no_cen = sum(centuries))
ggplot(a, aes(x = year, y = no_cen)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Player)